# Chunk defaults: show code, evaluate it, and suppress messages/warnings in the rendered output.
knitr::opts_chunk$set(echo=TRUE, eval=TRUE, message=FALSE, warning = FALSE)

Load packages

# library statements 
# read in data
library(dplyr)
library(readr)
library(broom)
library(ggplot2)
library(tidymodels) 
library(stringr)     # was loaded twice in the original; duplicate call removed
library(tidyverse)
library(gplots)
library(lubridate)
library(countrycode)
library(gapminder)
library(purrr)
library(vip)
library(probably)
library(plotly)

tidymodels_prefer() # Resolves conflicts, prefers tidymodel functions
set.seed(189) # fixed seed so CV folds and model fits are reproducible

Clean data

(DO NOT RUN THIS CHUNK — THE CLEANED DATA IS ALREADY PROVIDED IN data.csv.)

# Load the raw hotel reviews export.
Hotel_Reviews <- read_csv("Hotel_Reviews.csv")

# Treat empty strings as missing, then drop incomplete rows.
# NOTE(review): na_if() applied to a whole data frame relies on older dplyr
# behavior — confirm it still works on the installed dplyr version.
Hotel_Reviews_Cleaned <- Hotel_Reviews %>% na_if("") %>% na.omit
# The hotel's country is the last word of its street address.
Hotel_Reviews_Cleaned <- Hotel_Reviews_Cleaned %>% mutate(Hotel_Country = word(Hotel_Address,-1))

# "United Kingdom" is two words, so word(..., -1) yields just "Kingdom"; repair it.
Hotel_Reviews_Cleaned$Hotel_Country[Hotel_Reviews_Cleaned$Hotel_Country == "Kingdom"] <- "United Kingdom"
unique(Hotel_Reviews_Cleaned$Hotel_Country)

# Derive reviewer continent, split the review date, and add month-name and
# season features; then drop the free-text review columns.
Hotels_Cleaned <- Hotel_Reviews_Cleaned %>%
  # Reference the column directly inside mutate() instead of re-indexing the
  # data frame by name (the original's Hotel_Reviews_Cleaned$... breaks if
  # rows are filtered or reordered upstream of this step).
  mutate(Reviewer_continent = countrycode(sourcevar = Reviewer_Nationality,
                                          origin = "country.name",
                                          destination = "continent"))

# Review_Date is "month/day/year"; keep the pieces as separate columns.
Hotels_Cleaned <- Hotels_Cleaned %>%
  separate(Review_Date, c("month", "day", "year"), sep = "/")

# Full month name (e.g. "January") for readability.
Hotels_Cleaned <- Hotels_Cleaned %>% mutate(Month = month.name[as.numeric(month)])

# Map calendar month to season. Convert to numeric first (consistent with the
# Month step above): the original compared the CHARACTER month against 10:12,
# which silently misclassifies zero-padded values like "01" as "Summer".
Hotels_Cleaned <- Hotels_Cleaned %>%
  mutate(season = case_when(
    as.numeric(month) %in% 10:12 ~ "Fall",
    as.numeric(month) %in% 1:3   ~ "Winter",
    as.numeric(month) %in% 4:6   ~ "Spring",
    TRUE                         ~ "Summer"
  ))

# Drop the free-text review columns and the redundant review-age column.
# select(-any_of(...)) is safe when a column is absent, unlike
# df[, -which(...)], which drops ALL columns when which() returns integer(0).
Hotels_Cleaned <- Hotels_Cleaned %>%
  select(-any_of(c("Negative_Review", "Positive_Review", "days_since_review")))

# Persist the cleaned data and reload it for modeling.
# BUG FIX: the original wrote `cleanData`, an object never defined anywhere
# in this script — the cleaned data lives in `Hotels_Cleaned`. Also write to
# a real relative path instead of the placeholder ".../data.csv".
write.csv(Hotels_Cleaned, "data.csv", row.names = FALSE)
Hotel_Reviews <- read_csv("data.csv")

# Outcome: how far each reviewer's score deviates from the hotel's average.
Hotel_Reviews$Reviewer_Average_Difference <- Hotel_Reviews$Reviewer_Score - Hotel_Reviews$Average_Score 

# Delete unnecessary columns (address/name/tags are high-cardinality text).
# select(-any_of(...)) tolerates already-missing columns, unlike
# df[, -which(...)], which drops ALL columns when which() returns integer(0).
Hotel_Reviews <- Hotel_Reviews %>%
  select(-any_of(c("Hotel_Address", "Hotel_Name", "Tags")))

Regression Model

LASSO

# 10-fold cross-validation resamples for tuning the LASSO penalty
data_cv10 <- vfold_cv(Hotel_Reviews, v = 10)

# LASSO specification: mixture = 1 is pure L1 regularization; the penalty
# (lambda) is left as a tuning parameter.
lm_lasso_spec_tune <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_mode('regression') %>%
  set_engine(engine = 'glmnet')

# Preprocessing recipe for the LASSO model
full_rec <- recipe(Reviewer_Average_Difference ~., data = Hotel_Reviews) %>% 
    step_rm(month)%>%                            # redundant with Month/season
    step_nzv(all_predictors()) %>%               # drop near-zero-variance predictors
    step_unknown(all_nominal_predictors()) %>%   # pool unseen factor levels into "unknown"
    step_normalize(all_numeric_predictors()) %>% # important standardization step for LASSO
    # BUG FIX: step_corr(all_numeric()) also included the OUTCOME in the
    # correlation filter, so a predictor could be removed merely for being
    # correlated with the response (or the response itself removed).
    # Restrict the filter to predictors only.
    step_corr(all_numeric_predictors())%>%
    step_dummy(all_nominal_predictors())  # creates indicator variables for categorical variables

# Workflow: preprocessing recipe + tunable LASSO specification
lasso_wf_tune <- workflow() %>% 
  add_recipe(full_rec) %>%
  add_model(lm_lasso_spec_tune) 

# Tune Model (trying a variety of values of Lambda penalty)
# The range is on the log10 scale: penalties from 1e-4 through 10.
penalty_grid <- grid_regular(
  penalty(range = c(-4, 1)), 
  levels = 30)

# Cross-validated grid search over the 30 penalty values,
# scoring RMSE and MAE on each of the 10 folds.
tune_output <- tune_grid( 
  lasso_wf_tune, 
  resamples = data_cv10, 
  metrics = metric_set(rmse, mae),
  grid = penalty_grid 
)

# CV metric curves as a function of the penalty
autoplot(tune_output) + theme_classic()

# Summarize Model Evaluation Metrics (CV): mean and SE of each metric
# across the 10 folds, for every penalty value in the grid.
collect_metrics(tune_output)
## # A tibble: 60 × 7
##     penalty .metric .estimator  mean     n std_err .config              
##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1 0.0001   mae     standard    1.03    10 0.00129 Preprocessor1_Model01
##  2 0.0001   rmse    standard    1.35    10 0.00172 Preprocessor1_Model01
##  3 0.000149 mae     standard    1.03    10 0.00129 Preprocessor1_Model02
##  4 0.000149 rmse    standard    1.35    10 0.00172 Preprocessor1_Model02
##  5 0.000221 mae     standard    1.03    10 0.00129 Preprocessor1_Model03
##  6 0.000221 rmse    standard    1.35    10 0.00172 Preprocessor1_Model03
##  7 0.000329 mae     standard    1.03    10 0.00129 Preprocessor1_Model04
##  8 0.000329 rmse    standard    1.35    10 0.00172 Preprocessor1_Model04
##  9 0.000489 mae     standard    1.03    10 0.00129 Preprocessor1_Model05
## 10 0.000489 rmse    standard    1.35    10 0.00171 Preprocessor1_Model05
## # … with 50 more rows
# Choose the largest penalty whose CV MAE is within 1 SE of the best —
# i.e. the sparsest model statistically indistinguishable from the best one.
best_se_penalty <- select_by_one_std_err(tune_output, metric = 'mae', desc(penalty)) 
best_se_penalty 
## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n std_err .config            .best .bound
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>              <dbl>  <dbl>
## 1 0.00788 mae     standard    1.03    10 0.00126 Preprocessor1_Mod…  1.03   1.03
# Fit the final LASSO model on the full dataset using the selected penalty
final_wf_se <- finalize_workflow(lasso_wf_tune, best_se_penalty) # lock the penalty into the workflow
final_fit_se <- fit(final_wf_se, data = Hotel_Reviews)

# Rank the surviving (non-zero) coefficients by absolute magnitude
final_fit_se %>%
  tidy() %>%
  filter(estimate != 0) %>%
  mutate(importance = abs(estimate)) %>%
  arrange(desc(importance))
## # A tibble: 42 × 4
##    term                                      estimate penalty importance
##    <chr>                                        <dbl>   <dbl>      <dbl>
##  1 Review_Total_Negative_Word_Counts           -0.608 0.00788      0.608
##  2 Review_Total_Positive_Word_Counts            0.362 0.00788      0.362
##  3 Reviewer_Nationality_Israel                  0.358 0.00788      0.358
##  4 Reviewer_Nationality_United.Kingdom          0.227 0.00788      0.227
##  5 Reviewer_Nationality_Iran                   -0.192 0.00788      0.192
##  6 Reviewer_Nationality_Ireland                 0.188 0.00788      0.188
##  7 Reviewer_continent_Asia                     -0.177 0.00788      0.177
##  8 Reviewer_continent_Americas                  0.164 0.00788      0.164
##  9 Reviewer_Nationality_United.Arab.Emirates   -0.162 0.00788      0.162
## 10 Reviewer_continent_Oceania                   0.161 0.00788      0.161
## # … with 32 more rows
# Evaluating: list every coefficient that survived the LASSO penalty
final_fit_se %>% tidy() %>% filter(estimate != 0)
## # A tibble: 42 × 3
##    term                              estimate penalty
##    <chr>                                <dbl>   <dbl>
##  1 (Intercept)                        -0.117  0.00788
##  2 Average_Score                      -0.0534 0.00788
##  3 Review_Total_Negative_Word_Counts  -0.608  0.00788
##  4 Review_Total_Positive_Word_Counts   0.362  0.00788
##  5 Reviewer_Nationality_Belgium       -0.0421 0.00788
##  6 Reviewer_Nationality_China          0.100  0.00788
##  7 Reviewer_Nationality_Cyprus         0.128  0.00788
##  8 Reviewer_Nationality_France        -0.0176 0.00788
##  9 Reviewer_Nationality_Germany       -0.0331 0.00788
## 10 Reviewer_Nationality_Hong.Kong     -0.0306 0.00788
## # … with 32 more rows
# CV metrics (MAE, RMSE) at the penalty selected by the 1-SE rule
chosen_penalty <- best_se_penalty %>% pull(penalty)
tune_output %>%
  collect_metrics() %>%
  filter(penalty == chosen_penalty)
## # A tibble: 2 × 7
##   penalty .metric .estimator  mean     n std_err .config              
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1 0.00788 mae     standard    1.03    10 0.00126 Preprocessor1_Model12
## 2 0.00788 rmse    standard    1.35    10 0.00169 Preprocessor1_Model12
# Visual residuals: predict on the training data and compute residuals
# to check for unmodeled non-linearity.
lasso_mod_output <- final_fit_se %>%
    predict(new_data = Hotel_Reviews) %>%
    bind_cols(Hotel_Reviews) %>%
    mutate(resid = Reviewer_Average_Difference - .pred)

# Residuals vs fitted; the smooth should stay close to the zero line (red).
ggplot(lasso_mod_output, aes(x = .pred, y = resid)) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") + 
    theme_classic()

glmnet_output <- final_fit_se %>% extract_fit_parsnip() %>% pluck('fit') # get the original glmnet output

# Full sequence of lambda values glmnet fit along its regularization path
lambdas <- glmnet_output$lambda

# Visualize LASSO lambda result: one coefficient path per term, reshaped to
# long format, colored by the parent variable (dummy columns share a prefix).
coefs_lambdas <- 
  coefficients(glmnet_output, s = lambdas )  %>% 
  as.matrix() %>%  
  t() %>% 
  as.data.frame() %>% 
  mutate(lambda = lambdas ) %>% 
  select(lambda, everything(), -`(Intercept)`) %>% 
  pivot_longer(cols = -lambda, 
               names_to = "term", 
               values_to = "coef") %>%
  # First "_"-delimited token of the term name = parent variable name
  mutate(var = purrr::map_chr(stringr::str_split(term,"_"),~.[1]))

coefs_lambdas %>%
  ggplot(aes(x = lambda, y = coef, group = term, color = var)) +
  geom_line() +
  # Dashed vertical line marks the penalty chosen by the 1-SE rule
  geom_vline(xintercept = best_se_penalty %>% pull(penalty), linetype = 'dashed') + 
  theme_classic() + 
  theme(legend.position = "bottom", legend.text=element_text(size=8))

# Create a boolean matrix (predictors x lambdas) of variable exclusion:
# TRUE wherever a coefficient is exactly zero at that lambda.
bool_predictor_exclude <- glmnet_output$beta==0

# For each predictor, count how many lambda steps remain before the variable
# leaves the model for good (higher = survives stronger penalties = more
# important). vapply() replaces sapply() so the result is guaranteed to be a
# numeric vector regardless of the input's size.
var_imp <- vapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
    # Extract coefficient path (sorted from highest to lowest lambda)
    this_coeff_path <- bool_predictor_exclude[row,]
    # Compute and return the # of lambdas until this variable is out forever
    ncol(bool_predictor_exclude) - which.min(this_coeff_path) + 1
}, numeric(1))

# Create a dataset of this information and sort by importance
var_imp_data <- tibble(
    var_name = rownames(bool_predictor_exclude),
    var_imp = var_imp
)
var_imp_data %>% arrange(desc(var_imp))
## # A tibble: 263 × 2
##    var_name                            var_imp
##    <chr>                                 <dbl>
##  1 Total_Number_of_Reviews                  82
##  2 lat                                      82
##  3 Reviewer_Nationality_Antarctica          82
##  4 Reviewer_Nationality_Armenia             82
##  5 Reviewer_Nationality_Australia           82
##  6 Reviewer_Nationality_Cameroon            82
##  7 Reviewer_Nationality_Canada              82
##  8 Reviewer_Nationality_Cape.Verde          82
##  9 Reviewer_Nationality_Cayman.Islands      82
## 10 Reviewer_Nationality_Ecuador             82
## # … with 253 more rows
# Drop the nationality dummies, lat/lng, and month dummies so the remaining
# (more interpretable) variables surface in the importance ranking.
var_imp_data_delete_Nationality <- var_imp_data %>% filter(str_detect(var_name, "Reviewer_Nationality_")==FALSE, str_detect(var_name, "lat")==FALSE, str_detect(var_name, "Month_")==FALSE)
var_imp_data_delete_Nationality %>% arrange(desc(var_imp))
## # A tibble: 24 × 2
##    var_name                          var_imp
##    <chr>                               <dbl>
##  1 Total_Number_of_Reviews                82
##  2 Hotel_Country_unknown                  82
##  3 Reviewer_continent_Europe              82
##  4 Reviewer_continent_unknown             82
##  5 season_unknown                         82
##  6 Review_Total_Negative_Word_Counts      81
##  7 Review_Total_Positive_Word_Counts      76
##  8 Reviewer_continent_Asia                66
##  9 Average_Score                          55
## 10 Reviewer_continent_Americas            55
## # … with 14 more rows

GAMs

# Remove the date parts (month, Month, day) and raw nationality — continent
# is kept as the geographic feature. (Note: `year` is deliberately retained.)
# select(-any_of(...)) is safe when a column is absent, unlike
# df[, -which(...)], which drops ALL columns when which() returns integer(0).
Hotel_Reviews <- Hotel_Reviews %>%
  select(-any_of(c("month", "Month", "day", "Reviewer_Nationality")))

# Share of the review's words that are positive, rounded to 2 decimals
# via a formatC/as.numeric round-trip (kept from the original).
Hotel_Reviews$Review_Total_Positive_Word_Percentage <- Hotel_Reviews$Review_Total_Positive_Word_Counts / (Hotel_Reviews$Review_Total_Negative_Word_Counts + Hotel_Reviews$Review_Total_Positive_Word_Counts)

Hotel_Reviews$Review_Total_Positive_Word_Percentage <- formatC(Hotel_Reviews$Review_Total_Positive_Word_Percentage, digits = 2, format = "f")

Hotel_Reviews$Review_Total_Positive_Word_Percentage <- as.numeric(Hotel_Reviews$Review_Total_Positive_Word_Percentage)


# Share of the review's words that are negative, rounded the same way.
Hotel_Reviews$Review_Total_Negative_Word_Percentage <- Hotel_Reviews$Review_Total_Negative_Word_Counts / (Hotel_Reviews$Review_Total_Negative_Word_Counts + Hotel_Reviews$Review_Total_Positive_Word_Counts)

# BUG FIX: the original fed Review_Total_Positive_Word_Percentage into
# formatC here, overwriting the negative column with a copy of the
# positive one and discarding the computation on the line above.
Hotel_Reviews$Review_Total_Negative_Word_Percentage <- formatC(Hotel_Reviews$Review_Total_Negative_Word_Percentage, digits = 2, format = "f")

Hotel_Reviews$Review_Total_Negative_Word_Percentage <- as.numeric(Hotel_Reviews$Review_Total_Negative_Word_Percentage)

# Reviews with zero total words produce 0/0 = NaN; drop them. is.nan() is
# the explicit test (the original compared a numeric column to the string
# "NaN", which only worked through implicit character coercion).
Hotel_Reviews <- Hotel_Reviews %>% filter(!is.nan(Review_Total_Negative_Word_Percentage))

Hotel_Reviews <- Hotel_Reviews %>% filter(!is.nan(Review_Total_Positive_Word_Percentage))

Hotel_Reviews <- Hotel_Reviews %>% na_if("") %>% na.omit
# Visualize non-linear relationships before fitting GAMs:
# loess smooths (small span) by continent to eyeball curvature.
Hotel_Reviews %>%
    ggplot(aes(x = Review_Total_Negative_Word_Counts, y = Reviewer_Average_Difference, color = Reviewer_continent)) + 
    geom_point(alpha = 0.2) + 
    geom_smooth(span = 0.2, se = FALSE) +
    theme_classic()

Hotel_Reviews %>%
    ggplot(aes(x = Review_Total_Positive_Word_Counts, y = Reviewer_Average_Difference, color = Reviewer_continent)) + 
    geom_point(alpha = 0.2) + 
    geom_smooth(span = 0.2, se = FALSE) +
    theme_classic()

GAMs with Counts
# Generalized Additive Regression (GAM) Model via the mgcv engine
gam_spec <- 
  gen_additive_mod() %>%
  set_engine(engine = 'mgcv') %>%
  set_mode('regression') 

# Smooth terms s() for the numeric predictors; continent and season
# enter as ordinary (parametric) factor terms.
fit_gam_model <- gam_spec %>% 
  fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts) + s(Review_Total_Positive_Word_Counts) + s(Average_Score) + Reviewer_continent + season, data = Hotel_Reviews)  

# Summary: Parameter (linear) estimates and then Smooth Terms (H0: no relationship)
fit_gam_model %>% pluck('fit') %>% summary() 
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts) + 
##     s(Review_Total_Positive_Word_Counts) + s(Average_Score) + 
##     Reviewer_continent + season
## 
## Parametric coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -0.164874   0.013996 -11.780  < 2e-16 ***
## Reviewer_continentAmericas  0.260702   0.014823  17.587  < 2e-16 ***
## Reviewer_continentAsia     -0.161955   0.014401 -11.246  < 2e-16 ***
## Reviewer_continentEurope    0.160857   0.013774  11.678  < 2e-16 ***
## Reviewer_continentOceania   0.154822   0.015863   9.760  < 2e-16 ***
## seasonSpring                0.041426   0.005147   8.048 8.41e-16 ***
## seasonSummer                0.002059   0.005074   0.406    0.685    
## seasonWinter                0.123795   0.005253  23.565  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                        edf Ref.df       F p-value    
## s(Review_Total_Negative_Word_Counts) 8.926  8.997 16448.5  <2e-16 ***
## s(Review_Total_Positive_Word_Counts) 8.972  9.000  8469.4  <2e-16 ***
## s(Average_Score)                     7.979  8.692   548.7  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.296   Deviance explained = 29.6%
## GCV = 1.6401  Scale est. = 1.64      n = 510506
# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
par(mfrow=c(2,2)) # 2x2 panel layout for gam.check's four diagnostic plots
fit_gam_model %>% pluck('fit') %>% mgcv::gam.check() 

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 12 iterations.
## The RMS GCV score gradient at convergence was 2.111654e-07 .
## The Hessian was positive definite.
## Model rank =  35 / 35 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                        k'  edf k-index p-value  
## s(Review_Total_Negative_Word_Counts) 9.00 8.93    0.98   0.055 .
## s(Review_Total_Positive_Word_Counts) 9.00 8.97    0.97   0.055 .
## s(Average_Score)                     9.00 7.98    0.99   0.175  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot the estimated smooth function for each predictor
# Visualize: Look at the estimated non-linear functions 
# Dashed lines are +/- 2 SEs
fit_gam_model %>% pluck('fit') %>% plot( pages = 1)

# Refit with a larger basis (k = 20 per smooth): the default-k fit above had
# k-index p-values near the 0.05 boundary.
fit_gam_model_2 <- gam_spec %>% 
  fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts, k = 20) + s(Review_Total_Positive_Word_Counts, k = 20)+ Reviewer_continent + season + s(Average_Score, k = 20), data = Hotel_Reviews)  

# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
par(mfrow=c(2,2))
fit_gam_model_2 %>% pluck('fit') %>% mgcv::gam.check() 

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 11 iterations.
## The RMS GCV score gradient at convergence was 3.533854e-07 .
## The Hessian was positive definite.
## Model rank =  65 / 65 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                        k'  edf k-index p-value
## s(Review_Total_Negative_Word_Counts) 19.0 16.9    1.00    0.52
## s(Review_Total_Positive_Word_Counts) 19.0 18.8    0.99    0.21
## s(Average_Score)                     19.0 15.6    0.99    0.33
# Middle ground: k = 15 basis functions per smooth
fit_gam_model_3 <- gam_spec %>% 
  fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts, k = 15) + s(Review_Total_Positive_Word_Counts, k = 15) + Reviewer_continent + season + s(Average_Score, k = 15), data = Hotel_Reviews)  

# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
par(mfrow=c(2,2))
fit_gam_model_3 %>% pluck('fit') %>% mgcv::gam.check() 

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 11 iterations.
## The RMS GCV score gradient at convergence was 3.898239e-07 .
## The Hessian was positive definite.
## Model rank =  50 / 50 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                        k'  edf k-index p-value  
## s(Review_Total_Negative_Word_Counts) 14.0 13.9    0.98   0.085 .
## s(Review_Total_Positive_Word_Counts) 14.0 14.0    0.97   0.055 .
## s(Average_Score)                     14.0 13.5    0.98   0.065 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Our final GAM model with k = 10 basis functions per smooth.
# (The original comment said "20 breakpoints", but the code uses k = 10;
# the diagnostics above showed little benefit from larger k.)
fit_gam_model_final <- gam_spec %>% 
  fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Counts, k = 10) + s(Review_Total_Positive_Word_Counts, k = 10)+ Reviewer_continent + season + s(Average_Score, k = 10), data = Hotel_Reviews)  

fit_gam_model_final %>% pluck('fit') %>% plot( pages = 1)

GAMs with Percentage
# Generalized Additive Regression (GAM) Model using the word-share
# percentages instead of the raw word counts.
gam_percentage_spec <- 
  gen_additive_mod() %>%
  set_engine(engine = 'mgcv') %>%
  set_mode('regression') 

fit_gam_percentage_model <- gam_percentage_spec %>% 
  fit(Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Percentage) + s(Review_Total_Positive_Word_Percentage) + s(Average_Score) + Reviewer_continent + season, data = Hotel_Reviews)  

# Summary: Parameter (linear) estimates and then Smooth Terms (H0: no relationship)
fit_gam_percentage_model %>% pluck('fit') %>% summary() 
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Reviewer_Average_Difference ~ s(Review_Total_Negative_Word_Percentage) + 
##     s(Review_Total_Positive_Word_Percentage) + s(Average_Score) + 
##     Reviewer_continent + season
## 
## Parametric coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -0.165343   0.013712 -12.058  < 2e-16 ***
## Reviewer_continentAmericas  0.266811   0.014519  18.376  < 2e-16 ***
## Reviewer_continentAsia     -0.160449   0.014117 -11.366  < 2e-16 ***
## Reviewer_continentEurope    0.161555   0.013497  11.969  < 2e-16 ***
## Reviewer_continentOceania   0.155038   0.015542   9.975  < 2e-16 ***
## seasonSpring                0.040847   0.005044   8.099 5.57e-16 ***
## seasonSummer                0.002200   0.004973   0.442    0.658    
## seasonWinter                0.120793   0.005149  23.458  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                            edf Ref.df       F p-value    
## s(Review_Total_Negative_Word_Percentage) 7.545  7.546   0.978   0.552    
## s(Review_Total_Positive_Word_Percentage) 1.453  1.454   0.085   0.906    
## s(Average_Score)                         8.083  8.747 702.516  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 34/35
## R-sq.(adj) =  0.323   Deviance explained = 32.3%
## GCV =  1.576  Scale est. = 1.576     n = 510506
# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
par(mfrow=c(2,2)) # 2x2 panel layout for gam.check's four diagnostic plots
fit_gam_percentage_model %>% pluck('fit') %>% mgcv::gam.check() 

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 15 iterations.
## The RMS GCV score gradient at convergence was 2.373486e-07 .
## The Hessian was not positive definite.
## Model rank =  34 / 35 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                            k'  edf k-index p-value  
## s(Review_Total_Negative_Word_Percentage) 9.00 7.54    0.97   0.025 *
## s(Review_Total_Positive_Word_Percentage) 9.00 1.45    0.97   0.035 *
## s(Average_Score)                         9.00 8.08    1.01   0.715  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot the estimated smooth function for each predictor
# Visualize: Look at the estimated non-linear functions 
# Dashed lines are +/- 2 SEs
fit_gam_percentage_model %>% pluck('fit') %>% plot( pages = 1)

Classification Model

Clean data for classification

# Binary outcome for classification: did the reviewer rate the hotel above
# its average score? A comparison already yields TRUE/FALSE, so the
# original ifelse(x > 0, TRUE, FALSE) wrapper was redundant.
Hotel_Reviews$Reviewer_Average_Difference_Categorical <- Hotel_Reviews$Reviewer_Average_Difference > 0

Hotel_Reviews$Reviewer_Average_Difference_Categorical <- as.factor(Hotel_Reviews$Reviewer_Average_Difference_Categorical)

Hotel_Reviews <- Hotel_Reviews %>% filter(!is.na(Reviewer_continent))

# Remove location fields and every column that directly determines the new
# outcome (prevents leakage). select(-any_of(...)) tolerates already-missing
# columns, unlike df[, -which(...)], which drops ALL columns when which()
# returns integer(0).
Hotel_Reviews <- Hotel_Reviews %>%
  select(-any_of(c("lat", "lng",
                   "Reviewer_Average_Difference",
                   "Review_Total_Positive_Word_Percentage",
                   "Review_Total_Negative_Word_Percentage",
                   "Reviewer_Score",
                   "Average_Score")))

One Single Tree

# 10-fold CV resamples for tuning the tree's cost-complexity parameter
data_fold <- vfold_cv(Hotel_Reviews, v = 10)

# Decision tree spec: tune cost_complexity; min_n = 2 and unlimited depth
# let the cost-complexity pruning alone control model size.
ct_spec_tune <- decision_tree() %>%
  set_engine(engine = 'rpart') %>%
  set_args(cost_complexity = tune(),  
           min_n = 2, 
           tree_depth = NULL) %>% 
  set_mode('classification') 

# No preprocessing steps: rpart handles factors and is scale-invariant.
data_rec <- recipe(Reviewer_Average_Difference_Categorical~ ., data = Hotel_Reviews)

data_wf_tune <- workflow() %>%
  add_model(ct_spec_tune) %>%
  add_recipe(data_rec)

# 10 cost-complexity values on the log10 scale: 1e-5 through 10
param_grid <- grid_regular(cost_complexity(range = c(-5, 1)), levels = 10) 

tune_res <- tune_grid(
  data_wf_tune, 
  resamples = data_fold, 
  grid = param_grid, 
  metrics = metric_set(accuracy) #change this for regression trees
)

# CV accuracy as a function of cost complexity
autoplot(tune_res) + theme_classic()

# Largest (most-pruning) cost_complexity within 1 SE of the best accuracy,
# then refit the pruned tree on the full dataset.
best_complexity <- select_by_one_std_err(tune_res, metric = 'accuracy', desc(cost_complexity))
data_wf_final <- finalize_workflow(data_wf_tune, best_complexity)

hotel_final_fit <- fit(data_wf_final, data = Hotel_Reviews)

# CV accuracy at the selected cost-complexity value
tune_res %>% 
  collect_metrics() %>%
  filter(cost_complexity == best_complexity %>% pull(cost_complexity))
## # A tibble: 1 × 7
##   cost_complexity .metric  .estimator  mean     n  std_err .config              
##             <dbl> <chr>    <chr>      <dbl> <int>    <dbl> <chr>                
## 1        0.000215 accuracy binary     0.729    10 0.000680 Preprocessor1_Model03
library("rpart.plot")
# Draw the final pruned classification tree
hotel_final_fit %>% extract_fit_engine() %>% rpart.plot()

# For contrast: a heavily pruned tree (cp = 0.1) — very shallow, underfits
tree_mod_highcp <- fit(
    data_wf_tune %>%
        update_model(ct_spec_tune %>% set_args(cost_complexity = .1)),
    data = Hotel_Reviews
)

tree_mod_highcp %>% extract_fit_engine() %>% rpart.plot()

# And a lightly pruned tree (cp = 0.01) — deeper, more detailed splits
tree_mod_lowcp <- fit(
    data_wf_tune %>%
        update_model(ct_spec_tune %>% set_args(cost_complexity = .01)),
    data = Hotel_Reviews
)

tree_mod_lowcp %>% extract_fit_engine() %>% rpart.plot()

Random Forest

# Recipe: drop near-zero-variance predictors, guard against novel factor
# levels at predict time, and dummy-encode categoricals for ranger.
data_rec <- recipe(Reviewer_Average_Difference_Categorical ~ ., data = Hotel_Reviews) %>% 
  step_nzv(all_predictors()) %>%
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) 

# Model Specification: 1000-tree random forest; mtry = NULL uses ranger's
# default; impurity-based variable importance is recorded at fit time.
rf_spec <- rand_forest() %>%
  set_engine(engine = 'ranger') %>% 
  set_args(mtry = NULL, 
           trees = 1000, 
           min_n = 2,
           probability = FALSE, # FALSE: get hard predictions (not needed for regression)
           importance = 'impurity') %>% 
  set_mode('classification')

# Workflows
data_wf_mtry <- workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(data_rec)

# Fit Models (OOB evaluation below substitutes for a separate CV loop)
data_fit_mtry <- fit(data_wf_mtry, data = Hotel_Reviews)
# Evaluation
# Custom Function to get OOB predictions, true observed outcomes and add a user-provided model label
# Collect a fitted forest's out-of-bag predictions alongside the true
# observed outcomes, tagging every row with a user-provided model label.
rf_OOB_output <- function(fit_model, model_label, truth){
    oob_preds <- fit_model %>% extract_fit_engine() %>% pluck('predictions')
    tibble(
        .pred_class = oob_preds, # OOB predictions from ranger
        Reviewer_Average_Difference_Categorical = truth,
        label = model_label
    )
}

# Check out the function output (label = TRUE is just a tag for this model)
rf_OOB_output(data_fit_mtry, TRUE, Hotel_Reviews %>% pull(Reviewer_Average_Difference_Categorical))
## # A tibble: 510,506 × 3
##    .pred_class Reviewer_Average_Difference_Categorical label
##    <fct>       <fct>                                   <lgl>
##  1 FALSE       FALSE                                   TRUE 
##  2 TRUE        FALSE                                   TRUE 
##  3 TRUE        FALSE                                   TRUE 
##  4 FALSE       FALSE                                   TRUE 
##  5 FALSE       FALSE                                   TRUE 
##  6 TRUE        FALSE                                   TRUE 
##  7 FALSE       FALSE                                   TRUE 
##  8 TRUE        TRUE                                    TRUE 
##  9 FALSE       FALSE                                   TRUE 
## 10 TRUE        TRUE                                    TRUE 
## # … with 510,496 more rows
# Evaluate OOB Metrics (bind_rows allows stacking more models later)
data_rf_OOB_output <- bind_rows(
    rf_OOB_output(data_fit_mtry, TRUE, Hotel_Reviews %>% pull(Reviewer_Average_Difference_Categorical))
)

# OOB accuracy — an honest error estimate without a separate test set
data_rf_OOB_output %>% 
    accuracy(truth = Reviewer_Average_Difference_Categorical, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.730
# OOB prediction error: confusion matrix of OOB predictions vs truth
confusion <- rf_OOB_output(data_fit_mtry,TRUE, Hotel_Reviews %>% pull(Reviewer_Average_Difference_Categorical)) %>%
    conf_mat(truth = Reviewer_Average_Difference_Categorical, estimate = .pred_class)
confusion
##           Truth
## Prediction  FALSE   TRUE
##      FALSE 127926  54541
##      TRUE   83074 244965
# Variable importance measurement (impurity-based, recorded by ranger)
model_output <-data_fit_mtry %>% 
    extract_fit_engine() 

model_output %>% 
    vip(num_features = 30) + theme_classic() #based on impurity

# Top importance scores
model_output %>% vip::vi() %>% head()
## # A tibble: 6 × 2
##   Variable                                   Importance
##   <chr>                                           <dbl>
## 1 Review_Total_Negative_Word_Counts              42371.
## 2 Review_Total_Positive_Word_Counts              23336.
## 3 Total_Number_of_Reviews                        10535.
## 4 Additional_Number_of_Scoring                   10301.
## 5 Total_Number_of_Reviews_Reviewer_Has_Given      7163.
## 6 year                                            2011.
# Least important predictors (step_novel's unused "new" levels score 0)
model_output %>% vip::vi() %>% tail()
## # A tibble: 6 × 2
##   Variable                   Importance
##   <chr>                           <dbl>
## 1 Hotel_Country_Netherlands        563.
## 2 Hotel_Country_Italy              562.
## 3 Reviewer_continent_Oceania       404.
## 4 Hotel_Country_new                  0 
## 5 Reviewer_continent_new             0 
## 6 season_new                         0
# Distribution of negative word counts by outcome class
ggplot(Hotel_Reviews, aes(x = Reviewer_Average_Difference_Categorical, y = Review_Total_Negative_Word_Counts)) +
    geom_violin() + theme_classic()

# Distribution of positive word counts by outcome class
ggplot(Hotel_Reviews, aes(x = Reviewer_Average_Difference_Categorical, y = Review_Total_Positive_Word_Counts)) +
    geom_violin() + theme_classic()

LASSO for Logistic

# Set reference level (to the outcome you are NOT interested in)
Hotel_Reviews <- Hotel_Reviews%>%
  mutate(Reviewer_Average_Difference_Categorical = relevel(factor(Reviewer_Average_Difference_Categorical), ref='FALSE')) 

# Fresh 10-fold CV resamples on the classification data
data_cv10 <- vfold_cv(Hotel_Reviews, v = 10)

# Logistic LASSO Regression Model Spec (mixture = 1 is pure L1)
logistic_lasso_spec_tune <- logistic_reg() %>%
    set_engine('glmnet') %>%
    set_args(mixture = 1, penalty = tune()) %>%
    set_mode('classification')

# Recipe: standardize numerics (required for LASSO) and dummy-encode
logistic_rec <- recipe(Reviewer_Average_Difference_Categorical ~ ., data = Hotel_Reviews) %>%
    step_normalize(all_numeric_predictors()) %>% 
    step_dummy(all_nominal_predictors())

# Workflow (Recipe + Model)
log_lasso_wf <- workflow() %>% 
    add_recipe(logistic_rec) %>%
    add_model(logistic_lasso_spec_tune) 

# Tune Model (trying a variety of values of Lambda penalty)
penalty_grid <- grid_regular(
  penalty(range = c(-5, 1)), #log10 transformed  (kept moving min down from 0)
  levels = 100)

# event_level = 'second' scores TRUE (the second factor level) as the event
tune_output <- tune_grid( 
  log_lasso_wf, # workflow
  resamples = data_cv10, # cv folds
  metrics = metric_set(roc_auc,accuracy),
  control = control_resamples(save_pred = TRUE, event_level = 'second'),
  grid = penalty_grid # penalty grid defined above
)

# Visualize Model Evaluation Metrics from Tuning
autoplot(tune_output) + theme_classic()

# Select Penalty via the 1-SE rule on CV roc_auc
best_se_penalty <- select_by_one_std_err(tune_output, metric = 'roc_auc', desc(penalty)) # choose penalty value based on the largest penalty within 1 se of the highest CV roc_auc
best_se_penalty
## # A tibble: 1 × 9
##   penalty .metric .estimator  mean     n  std_err .config           .best .bound
##     <dbl> <chr>   <chr>      <dbl> <int>    <dbl> <chr>             <dbl>  <dbl>
## 1  0.0248 roc_auc binary     0.776    10 0.000621 Preprocessor1_Mo… 0.777  0.776
# Fit Final Model on the full dataset at the selected penalty
final_fit_se <- finalize_workflow(log_lasso_wf, best_se_penalty) %>% # incorporates penalty value to workflow 
    fit(data = Hotel_Reviews)

# Coefficients driven to exactly zero by the L1 penalty
final_fit_se %>% tidy() %>%
  filter(estimate == 0)
## # A tibble: 15 × 3
##    term                                       estimate penalty
##    <chr>                                         <dbl>   <dbl>
##  1 Additional_Number_of_Scoring                      0  0.0248
##  2 year                                              0  0.0248
##  3 Total_Number_of_Reviews                           0  0.0248
##  4 Total_Number_of_Reviews_Reviewer_Has_Given        0  0.0248
##  5 Hotel_Country_France                              0  0.0248
##  6 Hotel_Country_Italy                               0  0.0248
##  7 Hotel_Country_Netherlands                         0  0.0248
##  8 Hotel_Country_Spain                               0  0.0248
##  9 Hotel_Country_United.Kingdom                      0  0.0248
## 10 Reviewer_continent_Americas                       0  0.0248
## 11 Reviewer_continent_Europe                         0  0.0248
## 12 Reviewer_continent_Oceania                        0  0.0248
## 13 season_Spring                                     0  0.0248
## 14 season_Summer                                     0  0.0248
## 15 season_Winter                                     0  0.0248
final_fit_se %>% tidy() %>%
  filter(estimate != 0)
## # A tibble: 4 × 3
##   term                              estimate penalty
##   <chr>                                <dbl>   <dbl>
## 1 (Intercept)                          0.371  0.0248
## 2 Review_Total_Negative_Word_Counts   -0.847  0.0248
## 3 Review_Total_Positive_Word_Counts    0.532  0.0248
## 4 Reviewer_continent_Asia             -0.125  0.0248
# Extract the underlying glmnet engine fit so we can inspect the full
# coefficient paths across the entire lambda sequence (not just the
# single selected penalty)
glmnet_output <- final_fit_se %>% extract_fit_engine()
    
# Create a boolean matrix (predictors x lambdas) of variable exclusion:
# TRUE where the coefficient is shrunk exactly to zero at that lambda
bool_predictor_exclude <- glmnet_output$beta==0

# For each variable, count the number of lambdas (from the smallest up)
# for which the variable remains in the model before it is excluded
# forever -- larger values mean the variable survives more shrinkage.
# vapply is used instead of sapply so the result is guaranteed to be a
# numeric vector regardless of input shape (sapply's return type is
# input-dependent, a silent footgun in non-interactive code).
var_imp <- vapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
    # Extract coefficient path (sorted from highest to lowest lambda)
    this_coeff_path <- bool_predictor_exclude[row,]
    # which.min on a logical vector finds the first FALSE (the first
    # lambda where this coefficient is nonzero); everything from there to
    # the end of the path counts toward the importance score.
    # NOTE(review): if a variable were zero at EVERY lambda, which.min
    # would still return 1, assigning it maximal importance -- this does
    # not occur for these data (all 18 variables enter at some lambda),
    # but confirm before reusing this snippet elsewhere.
    ncol(bool_predictor_exclude) - which.min(this_coeff_path) + 1
}, numeric(1))

# Create a dataset of this information and sort
var_imp_data <- tibble(
    var_name = rownames(bool_predictor_exclude),
    var_imp = var_imp
)
# Rank variables by importance: a larger var_imp means the variable stays
# in the model across more penalty values, i.e. survives more shrinkage
var_imp_data %>% arrange(desc(var_imp))
## # A tibble: 18 × 2
##    var_name                                   var_imp
##    <chr>                                        <dbl>
##  1 year                                            63
##  2 Hotel_Country_Spain                             63
##  3 season_Summer                                   63
##  4 Review_Total_Negative_Word_Counts               62
##  5 Review_Total_Positive_Word_Counts               58
##  6 Reviewer_continent_Asia                         47
##  7 Reviewer_continent_Americas                     39
##  8 Total_Number_of_Reviews_Reviewer_Has_Given      36
##  9 season_Winter                                   36
## 10 Hotel_Country_Netherlands                       32
## 11 Hotel_Country_Italy                             31
## 12 Additional_Number_of_Scoring                    22
## 13 Hotel_Country_United.Kingdom                    20
## 14 season_Spring                                   19
## 15 Reviewer_continent_Europe                       18
## 16 Reviewer_continent_Oceania                      14
## 17 Hotel_Country_France                            10
## 18 Total_Number_of_Reviews                          9
# CV results for "best lambda"
# near() replaces == : penalty is a double, and exact floating-point
# equality against a stored value is fragile -- a tiny representation
# difference would silently return zero rows
tune_output %>%
    collect_metrics() %>%
    filter(near(penalty, best_se_penalty %>% pull(penalty)))
## # A tibble: 2 × 7
##   penalty .metric  .estimator  mean     n  std_err .config               
##     <dbl> <chr>    <chr>      <dbl> <int>    <dbl> <chr>                 
## 1  0.0248 accuracy binary     0.692    10 0.000738 Preprocessor1_Model057
## 2  0.0248 roc_auc  binary     0.776    10 0.000621 Preprocessor1_Model057
# Count up number of T/F in the training data
Hotel_Reviews %>%
    count(Reviewer_Average_Difference_Categorical) # Name of the outcome variable goes inside count()
## # A tibble: 2 × 2
##   Reviewer_Average_Difference_Categorical      n
##   <fct>                                    <int>
## 1 FALSE                                   211000
## 2 TRUE                                    299506
# Compute the NIR (no-information rate: the accuracy of always predicting
# the majority class). Computed from the data itself rather than from
# hard-coded literals -- the values previously typed in here (299575 and
# 211057) did not match the counts printed above (299506 and 211000),
# though both give approximately 0.5867.
Hotel_Reviews %>%
    count(Reviewer_Average_Difference_Categorical) %>%
    summarize(nir = max(n) / sum(n)) %>%
    pull(nir)
# Soft Predictions on Training Data
# (predicted class probabilities, bound back onto the original rows)
final_output <- final_fit_se %>% 
  predict(new_data = Hotel_Reviews, type='prob') %>% 
  bind_cols(Hotel_Reviews)

# Distribution of predicted P(TRUE) within each observed class --
# well-separated boxes indicate the model discriminates the classes
final_output %>%
  ggplot(aes(x = Reviewer_Average_Difference_Categorical, y = .pred_TRUE)) +
  geom_boxplot()

# Use soft predictions
# ROC curve; event_level = 'second' treats TRUE as the event of interest
final_output %>%
    roc_curve(Reviewer_Average_Difference_Categorical,.pred_TRUE,event_level = 'second') %>%
    autoplot()

# Thresholds in terms of reference level
# Sweep classification thresholds from 0 to 1 on .pred_FALSE (the
# probability of the first/reference factor level) and compute
# threshold-dependent performance metrics at each step
threshold_output <- final_output %>%
    threshold_perf(truth = Reviewer_Average_Difference_Categorical, estimate = .pred_FALSE, thresholds = seq(0,1,by=.01)) 

# J-index v. threshold for reviewer_Average_Difference_Categorical
# (J = sensitivity + specificity - 1; higher is better)
threshold_output %>%
    filter(.metric == 'j_index') %>%
    ggplot(aes(x = .threshold, y = .estimate)) +
    geom_line() +
    labs(y = 'J-index', x = 'threshold') +
    theme_classic()

# Distance metric = distance to the ideal (sens = 1, spec = 1) corner;
# smaller is better, so the top row gives the best threshold (0.40 here)
threshold_output %>%
    filter(.metric == 'distance') %>%
    arrange(.estimate)
## # A tibble: 101 × 4
##    .threshold .metric  .estimator .estimate
##         <dbl> <chr>    <chr>          <dbl>
##  1       0.4  distance binary         0.160
##  2       0.39 distance binary         0.162
##  3       0.41 distance binary         0.169
##  4       0.38 distance binary         0.176
##  5       0.42 distance binary         0.186
##  6       0.37 distance binary         0.200
##  7       0.43 distance binary         0.207
##  8       0.44 distance binary         0.233
##  9       0.36 distance binary         0.235
## 10       0.45 distance binary         0.261
## # … with 91 more rows
# Bundle the hard-classification metrics; yardstick::spec is namespaced
# explicitly to avoid the conflict with readr::spec()
log_metrics <- metric_set(accuracy,sens,yardstick::spec)

#Accuracy + Specificity + Sensitivity 
# Hard classifications at the 0.40 threshold selected above; the threshold
# is applied to .pred_FALSE (probability of the first factor level), and
# event_level = 'second' keeps TRUE as the event for sens/spec
final_output %>%
    mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(Reviewer_Average_Difference_Categorical
), threshold =  0.40)) %>%
    log_metrics(truth = Reviewer_Average_Difference_Categorical
, estimate = .pred_class, event_level = 'second')
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.718
## 2 sens     binary         0.720
## 3 spec     binary         0.715

Clustering Model

K-Means

# Reload original csv to bring back some variables
Hotel_Reviews <- read_csv("data.csv")

#Hotel_Reviews <- Hotel_Reviews[, -which(names(Hotel_Reviews) == "Hotel_Name")]
#Hotel_Reviews <- Hotel_Reviews[, -which(names(Hotel_Reviews) == "Tags")]
# Select just 3 variables to cluster on (two word counts + reviewer score)
Hotel_Reviews_sub <- Hotel_Reviews %>%
    select(Review_Total_Positive_Word_Counts, Reviewer_Score, Review_Total_Negative_Word_Counts)

# Run k-means for k = centers = 3
# NOTE(review): this clusters the raw (unscaled) variables, so the
# word-count variables likely dominate the Euclidean distances relative to
# Reviewer_Score -- confirm this is intentional (hotel_cluster_ss below
# applies scale() first)
set.seed(253)
kclust_k3 <- kmeans(Hotel_Reviews_sub, centers = 3)

# Data-specific function to cluster and calculate total within-cluster SS
hotel_cluster_ss <- function(k){
    # Perform clustering
    kclust3 <- kmeans(scale(Hotel_Reviews_sub), centers = 3)

    # Return the total within-cluster sum of squares
    return(kclust3$tot.withinss)
}
hotel_cluster_ss
## function(k){
##     # Perform clustering
##     kclust3 <- kmeans(scale(Hotel_Reviews_sub), centers = 3)
## 
##     # Return the total within-cluster sum of squares
##     return(kclust3$tot.withinss)
## }
# Add a variable (kclust_3) to the original dataset 
# containing the cluster assignments from the k = 3 fit above
Hotel_Reviews <- Hotel_Reviews %>%
    mutate(kclust_3 = factor(kclust_k3$cluster))

# Visualize the cluster assignments on the original scatterplot
# (positive vs. negative word counts, colored by assigned cluster)
ggplot(Hotel_Reviews, aes(x = Review_Total_Positive_Word_Counts, y = Review_Total_Negative_Word_Counts, color = kclust_3)) + geom_point() + theme_classic()

# 3D plot
# Interactive plotly view adding Reviewer_Score as the third axis
fig_3D <- plot_ly(Hotel_Reviews, x=~Review_Total_Positive_Word_Counts, y=~Review_Total_Negative_Word_Counts, z=~Reviewer_Score, color =~kclust_3) %>%
  add_markers(size=1) 

fig_3D